Note: Using vst normalisation (just for performing the DEA from the limma package)
# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columns in datExpr don\'t match the rowd in datMeta!')
}
# Annotate probes
getinfo = c('ensembl_gene_id','external_gene_id','chromosome_name','start_position',
'end_position','strand','band','gene_biotype','percentage_gc_content')
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
datProbes = getBM(attributes=getinfo, filters=c('ensembl_gene_id'), values=rownames(datExpr), mart=mart)
datProbes = datProbes[match(rownames(datExpr), datProbes$ensembl_gene_id),]
datProbes$length = datProbes$end_position-datProbes$start_position
# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
#################################################################################
# FILTERS:
# 1 Filter probes with start or end position missing (filter 5)
# These can be filtered without probe info, they have weird IDS that don't start with ENS
to_keep = !is.na(datProbes$length)
datProbes = datProbes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datProbes) = datProbes$ensembl_gene_id
# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
# 3. Filter samples with rowSums <= 40
to_keep = rowSums(datExpr)>40
datExpr = datExpr[to_keep,]
datProbes = datProbes[to_keep,]
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol')
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal' = 1)
datExpr_backup = datExpr
rm(getinfo, to_keep, gene_names, mart, GO_annotations)
Calculating the mean expression of the genes that don’t have a neuronal functional annotation nor a SFARI score for ADS and CTL separately and plotting ASD vs CTL with log2 scale on both axis we can see that the fitted line has a slope close to 0.5
housekeeping_genes = rownames(datExpr)[!rownames(datExpr) %in% GO_neuronal$ID &
!rownames(datExpr) %in% SFARI_genes$ID]
housekeeping_genes_ASD = datExpr %>% dplyr::select(which(datMeta$Diagnosis_=='ASD')) %>%
filter(rownames(.) %in% housekeeping_genes) %>% rowMeans
housekeeping_genes_CTL = datExpr %>% dplyr::select(which(datMeta$Diagnosis_!='ASD')) %>%
filter(rownames(.) %in% housekeeping_genes) %>% rowMeans
ASD_vs_CTL = data.frame('ID'=housekeeping_genes, 'ASD'=housekeeping_genes_ASD+1, 'CTL'=housekeeping_genes_CTL+1)
ggplotly(ASD_vs_CTL %>% ggplot(aes(ASD, CTL)) + geom_point(alpha=0.1, aes(id=ID)) + coord_fixed() +
geom_smooth(method=lm, se=F, color='#009999') + theme_minimal() +
scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2'))
rm(housekeeping_genes_ASD, housekeeping_genes_CTL)
Fit a linear model between ASD and CTL for these non-neuronal, non-SFARI genes and use the slope and intercept to transform the ASD expression to match the one from the CTL samples.
fit = lm(log2(ASD) ~ log2(CTL), ASD_vs_CTL)
slope = summary(fit)$coefficients[2,1]
intercept = summary(fit)$coefficients[1,1]
# print(glue('Intercept = ', round(intercept,2), ' and slope = ', round(slope,2)))
datExpr_ASD = datExpr %>% dplyr::select(which(datMeta$Diagnosis_=='ASD'))
datExpr_CTL = datExpr %>% dplyr::select(which(datMeta$Diagnosis_!='ASD'))
datExpr_ASD = (datExpr_ASD-intercept)/slope
datExpr_fix = bind_cols(datExpr_ASD, datExpr_CTL)
datExpr_fix = datExpr_fix[,match(colnames(datExpr),colnames(datExpr_fix))]
rownames(datExpr_fix) = rownames(datExpr)
min_val = datExpr_fix %>% apply(1, min) %>% min
if(min_val<0) datExpr_fix = datExpr_fix + abs(min_val)
datExpr_fix = floor(datExpr_fix) # Round doesn't work I don't know why!
housekeeping_genes_ASD = datExpr_fix %>% dplyr::select(which(datMeta$Diagnosis_=='ASD')) %>%
filter(rownames(.) %in% housekeeping_genes) %>% rowMeans
housekeeping_genes_CTL = datExpr_fix %>% dplyr::select(which(datMeta$Diagnosis_!='ASD')) %>%
filter(rownames(.) %in% housekeeping_genes) %>% rowMeans
ASD_vs_CTL = data.frame('ASD' = housekeeping_genes_ASD+1, 'CTL' = housekeeping_genes_CTL+1)
ggplotly(ASD_vs_CTL %>% ggplot(aes(ASD, CTL)) + geom_point(alpha=0.1) + coord_fixed() +
geom_smooth(method=lm, se=F, color='#009999') + scale_x_continuous(trans='log2') +
scale_y_continuous(trans='log2') + theme_minimal())
rm(fit, slope, intercept, datExpr_ASD, datExpr_CTL, housekeeping_genes,
housekeeping_genes_ASD, housekeeping_genes_CTL, ASD_vs_CTL)
################################################################################
# Calculate Differential Expression using limma and DESeq2 packages
datExpr = datExpr_fix
# Limma
if(!file.exists('./../working_data/genes_ASD_DE_info_limma_fix.csv')){
# First perform vst normalisation
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design =~Diagnosis_)
dds = estimateSizeFactors(dds)
vst_output = vst(dds)
datExpr_vst = assay(vst_output)
# Calculate DE of normalised data
mod = model.matrix(~ Diagnosis_, data=datMeta)
corfit = duplicateCorrelation(datExpr_vst, mod, block=datMeta$Subject_ID)
lmfit = lmFit(datExpr_vst, design=mod, block=datMeta$Subject_ID, correlation=corfit$consensus)
fit = eBayes(lmfit, trend=T, robust=T)
top_genes = topTable(fit, coef=2, number=nrow(datExpr_vst))
DE_info_limma = top_genes[match(rownames(datExpr_vst), rownames(top_genes)),] %>%
rownames_to_column(var = 'ID') %>%
mutate('logFC_limma'=logFC, 'adj.P.Val_limma'=adj.P.Val) %>%
dplyr::select(ID, logFC_limma, adj.P.Val_limma)
write.csv(DE_info_limma, './../working_data/genes_ASD_DE_info_limma_fix.csv', row.names = FALSE)
rm(mod, corfit, lmfit, fit, top_genes)
} else DE_info_limma = read.csv('./../working_data/genes_ASD_DE_info_limma_fix.csv')
# DESeq2 DE
if(!file.exists('./../working_data/genes_ASD_DE_info_DESeq2_fix.csv')){
counts = as.matrix(datExpr)
rowRanges = GRanges(datProbes$chromosome_name,
IRanges(datProbes$start_position, width=datProbes$length),
strand=datProbes$strand,
feature_id=datProbes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
ddsSE = DESeqDataSet(se, design =~Diagnosis_)
dds = DESeq(ddsSE)
DE_info_DESeq2 = results(dds) %>% data.frame %>% rownames_to_column(var = 'ID') %>%
mutate('logFC_DESeq2'=log2FoldChange, 'adj.P.Val_DESeq2'=padj) %>%
dplyr::select(ID, logFC_DESeq2, adj.P.Val_DESeq2)
write.csv(DE_info_DESeq2, './../working_data/genes_ASD_DE_info_DESeq2_fix.csv', row.names = FALSE)
rm(counts, rowRanges, se, ddsSE, dds)
} else DE_info_DESeq2 = read.csv('./../working_data/genes_ASD_DE_info_DESeq2_fix.csv')
DE_info = DE_info_limma %>% left_join(DE_info_DESeq2, by='ID')
rm(DE_info_limma, DE_info_DESeq2)
ggplotly(DE_info %>% ggplot(aes(logFC_DESeq2, logFC_limma)) + geom_point(alpha=0.05, aes(id=ID)) +
geom_vline(xintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_vline(xintercept= log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept= log2(1.2), color='#cc0099', size=0.5) +
ggtitle(glue('LogFC (corr=',round(cor(DE_info$logFC_limma, DE_info$logFC_DESeq2),2),')')) +
geom_smooth(method=lm, se=F, color='#009999', size=0.5) + coord_fixed() +
theme_minimal())
Although both functions use the BH method to adjust for multiple testing, the limma package seems to be more strict, assigning a high p-value to many of the points that DESeq2 considers statistically significant
DE_info %>% ggplot(aes(adj.P.Val_DESeq2, adj.P.Val_limma)) + geom_point(alpha=0.05, aes(id=ID)) +
geom_vline(xintercept=0.05, color='#cc0099') +
geom_hline(yintercept=0.05, color='#cc0099') +
geom_smooth(method=lm, se=F, color='#009999', size=0.5) + coord_fixed() +
ggtitle(glue('Adjusted p-values (corr=',round(cor(DE_info$adj.P.Val_limma,
DE_info$adj.P.Val_DESeq2),2),')')) +
theme_minimal()
signif_genes = data.frame('limma Significance' = DE_info$adj.P.Val_limma<0.05 & DE_info$logFC_limma>log2(1.2),
'DESeq2 Significance' = DE_info$adj.P.Val_DESeq2<0.05 & DE_info$logFC_DESeq2>log2(1.2),
'ID' = DE_info$ID)
signif_genes %>% dplyr::select(-ID) %>% table
## DESeq2.Significance
## limma.Significance FALSE TRUE
## FALSE 31559 512
## TRUE 178 1229
signif_genes %>% dplyr::select(-ID) %>% table/nrow(DE_info)*100
## DESeq2.Significance
## limma.Significance FALSE TRUE
## FALSE 94.2678774 1.5293626
## TRUE 0.5316925 3.6710676
print(glue(round(sum(signif_genes$limma.Significance & !signif_genes$DESeq2.Significance)/sum(signif_genes$limma.Significance)*100,2),
'% of the statistically significant limma genes are not significant for DESeq2.
', round(sum(!signif_genes$limma.Significance & signif_genes$DESeq2.Significance)/sum(signif_genes$DESeq2.Significance)*100,2),
'% of the statistically significant DESeq2 genes are not significant for limma.'))
## 12.65% of the statistically significant limma genes are not significant for DESeq2.
## 29.41% of the statistically significant DESeq2 genes are not significant for limma.
SFARI score distribution of genes classified as significant by both methods
both_methods = data.frame('ID'=signif_genes %>% filter(limma.Significance & DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(both_methods$`gene-score`, useNA='ifany')
##
## 1 2 3 4 5 6 <NA>
## 2 3 17 44 27 2 1134
round(table(both_methods$`gene-score`, useNA='ifany')/nrow(both_methods)*100,2)
##
## 1 2 3 4 5 6 <NA>
## 0.16 0.24 1.38 3.58 2.20 0.16 92.27
rm(both_methods)
SFARI score distribution of genes classified as significant by limma but not DESeq2
only_limma = data.frame('ID'=signif_genes %>% filter(limma.Significance & !DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(only_limma$`gene-score`, useNA='ifany')
##
## 2 3 4 5 6 <NA>
## 2 2 4 5 1 164
round(table(only_limma$`gene-score`, useNA='ifany')/nrow(only_limma)*100,2)
##
## 2 3 4 5 6 <NA>
## 1.12 1.12 2.25 2.81 0.56 92.13
rm(only_limma)
SFARI score distribution of genes classified as significant by DESeq2 but not limma
only_DESeq2 = data.frame('ID'=signif_genes %>% filter(!limma.Significance & DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(only_DESeq2$`gene-score`, useNA='ifany')
##
## 2 3 4 5 <NA>
## 2 3 7 3 497
round(table(only_DESeq2$`gene-score`, useNA='ifany')/nrow(only_DESeq2)*100,2)
##
## 2 3 4 5 <NA>
## 0.39 0.59 1.37 0.59 97.07
rm(only_DESeq2)
SFARI score distribution of genes classified as not significant by both methods
neither = data.frame('ID'=signif_genes %>% filter(!limma.Significance & !DESeq2.Significance) %>% dplyr::select(ID)) %>%
left_join(SFARI_genes, by='ID') %>% dplyr::select(ID, `gene-score`)
table(neither$`gene-score`, useNA='ifany')
##
## 1 2 3 4 5 6 <NA>
## 22 53 166 378 132 21 30788
round(table(neither$`gene-score`, useNA='ifany')/nrow(neither)*100,2)
##
## 1 2 3 4 5 6 <NA>
## 0.07 0.17 0.53 1.20 0.42 0.07 97.55
rm(neither, signif_genes)
limma_orig = read.csv('./../working_data/genes_ASD_DE_info_limma.csv')
colnames(limma_orig) = c('ID','logFC_orig', 'adj.P.Val_orig')
limma_fix = read.csv('./../working_data/genes_ASD_DE_info_limma_fix.csv')
colnames(limma_fix) = c('ID','logFC_fix', 'adj.P.Val_fix')
limma_both = limma_orig %>% left_join(limma_fix, by='ID')
ggplotly(limma_both %>% ggplot(aes(logFC_orig, logFC_fix)) + geom_point(alpha=0.05, aes(id=ID)) + coord_fixed() +
geom_vline(xintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_vline(xintercept= log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept= log2(1.2), color='#cc0099', size=0.5) +
theme_minimal()+ ggtitle(glue('LogFC (corr=',round(cor(limma_both$logFC_orig,
limma_both$logFC_fix),4),')')))
ggplotly(limma_both %>% ggplot(aes(adj.P.Val_orig, adj.P.Val_fix)) + geom_point(alpha=0.05, aes(id=ID)) +
geom_vline(xintercept=0.05, color='#cc0099') + geom_hline(yintercept=0.05, color='#cc0099') +
theme_minimal()+ ggtitle(glue('adjusted p-value (corr=',round(cor(limma_both$adj.P.Val_orig,
limma_both$adj.P.Val_fix),4),')')) + coord_fixed())
rm(limma_orig, limma_fix)
limma_comp = data.frame('orig_significant' = limma_both$logFC_orig>log2(1.2) & limma_both$adj.P.Val_orig<0.05,
'fix_significant' = limma_both$logFC_fix>log2(1.2) & limma_both$adj.P.Val_fix<0.05)
table(limma_comp)
## fix_significant
## orig_significant FALSE TRUE
## FALSE 32033 16
## TRUE 38 1391
DESeq2_orig = read.csv('./../working_data/genes_ASD_DE_info_DESeq2.csv') %>% dplyr::select(-X)
colnames(DESeq2_orig) = c('ID','logFC_orig', 'adj.P.Val_orig')
DESeq2_fix = read.csv('./../working_data/genes_ASD_DE_info_DESeq2_fix.csv')
colnames(DESeq2_fix) = c('ID','logFC_fix', 'adj.P.Val_fix')
DESeq2_both = DESeq2_orig %>% left_join(DESeq2_fix, by='ID')
ggplotly(DESeq2_both %>% ggplot(aes(logFC_orig, logFC_fix)) + geom_point(alpha=0.05, aes(id=ID)) + coord_fixed() +
geom_vline(xintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept=-log2(1.2), color='#cc0099', size=0.5) +
geom_vline(xintercept= log2(1.2), color='#cc0099', size=0.5) +
geom_hline(yintercept= log2(1.2), color='#cc0099', size=0.5) +
theme_minimal() + ggtitle(glue('LogFC (corr=',round(cor(DESeq2_both$logFC_orig,
DESeq2_both$logFC_fix),4),')')))